# Importing libraries
import numpy as np
import pandas as pd
import statistics as stats
import seaborn as sns
import matplotlib.pyplot as plt
# Load the weatherAUS CSV into a DataFrame and echo it
csv_path = "C:\\Users\\Hrushikesh Patel\\Desktop\\p\\project\\weatherAUS.csv"
dataset = pd.read_csv(csv_path)
dataset
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 145455 | 2017-06-21 | Uluru | 2.8 | 23.4 | 0.0 | NaN | NaN | E | 31.0 | SE | ... | 51.0 | 24.0 | 1024.6 | 1020.3 | NaN | NaN | 10.1 | 22.4 | No | No |
| 145456 | 2017-06-22 | Uluru | 3.6 | 25.3 | 0.0 | NaN | NaN | NNW | 22.0 | SE | ... | 56.0 | 21.0 | 1023.5 | 1019.1 | NaN | NaN | 10.9 | 24.5 | No | No |
| 145457 | 2017-06-23 | Uluru | 5.4 | 26.9 | 0.0 | NaN | NaN | N | 37.0 | SE | ... | 53.0 | 24.0 | 1021.0 | 1016.8 | NaN | NaN | 12.5 | 26.1 | No | No |
| 145458 | 2017-06-24 | Uluru | 7.8 | 27.0 | 0.0 | NaN | NaN | SE | 28.0 | SSE | ... | 51.0 | 24.0 | 1019.4 | 1016.5 | 3.0 | 2.0 | 15.1 | 26.0 | No | No |
| 145459 | 2017-06-25 | Uluru | 14.9 | NaN | 0.0 | NaN | NaN | NaN | NaN | ESE | ... | 62.0 | 36.0 | 1020.2 | 1017.9 | 8.0 | 8.0 | 15.0 | 20.9 | No | NaN |
145460 rows × 23 columns
# Preview the first five rows of the raw frame.
dataset.head()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
5 rows × 23 columns
EDA (Exploratory Data Analysis)
Data Preparation
# (rows, columns) of the raw frame.
dataset.shape
(145460, 23)
# Column labels, in file order.
dataset.columns
Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
'Temp3pm', 'RainToday', 'RainTomorrow'],
dtype='object')
dataset.dtypes # dtype of each column (every column is a pandas Series)
Date object Location object MinTemp float64 MaxTemp float64 Rainfall float64 Evaporation float64 Sunshine float64 WindGustDir object WindGustSpeed float64 WindDir9am object WindDir3pm object WindSpeed9am float64 WindSpeed3pm float64 Humidity9am float64 Humidity3pm float64 Pressure9am float64 Pressure3pm float64 Cloud9am float64 Cloud3pm float64 Temp9am float64 Temp3pm float64 RainToday object RainTomorrow object dtype: object
# Per-column non-null counts and dtypes, plus overall memory usage.
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145460 entries, 0 to 145459 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 145460 non-null object 1 Location 145460 non-null object 2 MinTemp 143975 non-null float64 3 MaxTemp 144199 non-null float64 4 Rainfall 142199 non-null float64 5 Evaporation 82670 non-null float64 6 Sunshine 75625 non-null float64 7 WindGustDir 135134 non-null object 8 WindGustSpeed 135197 non-null float64 9 WindDir9am 134894 non-null object 10 WindDir3pm 141232 non-null object 11 WindSpeed9am 143693 non-null float64 12 WindSpeed3pm 142398 non-null float64 13 Humidity9am 142806 non-null float64 14 Humidity3pm 140953 non-null float64 15 Pressure9am 130395 non-null float64 16 Pressure3pm 130432 non-null float64 17 Cloud9am 89572 non-null float64 18 Cloud3pm 86102 non-null float64 19 Temp9am 143693 non-null float64 20 Temp3pm 141851 non-null float64 21 RainToday 142199 non-null object 22 RainTomorrow 142193 non-null object dtypes: float64(16), object(7) memory usage: 25.5+ MB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()
| MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 143975.000000 | 144199.000000 | 142199.000000 | 82670.000000 | 75625.000000 | 135197.000000 | 143693.000000 | 142398.000000 | 142806.000000 | 140953.000000 | 130395.00000 | 130432.000000 | 89572.000000 | 86102.000000 | 143693.000000 | 141851.00000 |
| mean | 12.194034 | 23.221348 | 2.360918 | 5.468232 | 7.611178 | 40.035230 | 14.043426 | 18.662657 | 68.880831 | 51.539116 | 1017.64994 | 1015.255889 | 4.447461 | 4.509930 | 16.990631 | 21.68339 |
| std | 6.398495 | 7.119049 | 8.478060 | 4.193704 | 3.785483 | 13.607062 | 8.915375 | 8.809800 | 19.029164 | 20.795902 | 7.10653 | 7.037414 | 2.887159 | 2.720357 | 6.488753 | 6.93665 |
| min | -8.500000 | -4.800000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 980.50000 | 977.100000 | 0.000000 | 0.000000 | -7.200000 | -5.40000 |
| 25% | 7.600000 | 17.900000 | 0.000000 | 2.600000 | 4.800000 | 31.000000 | 7.000000 | 13.000000 | 57.000000 | 37.000000 | 1012.90000 | 1010.400000 | 1.000000 | 2.000000 | 12.300000 | 16.60000 |
| 50% | 12.000000 | 22.600000 | 0.000000 | 4.800000 | 8.400000 | 39.000000 | 13.000000 | 19.000000 | 70.000000 | 52.000000 | 1017.60000 | 1015.200000 | 5.000000 | 5.000000 | 16.700000 | 21.10000 |
| 75% | 16.900000 | 28.200000 | 0.800000 | 7.400000 | 10.600000 | 48.000000 | 19.000000 | 24.000000 | 83.000000 | 66.000000 | 1022.40000 | 1020.000000 | 7.000000 | 7.000000 | 21.600000 | 26.40000 |
| max | 33.900000 | 48.100000 | 371.000000 | 145.000000 | 14.500000 | 135.000000 | 130.000000 | 87.000000 | 100.000000 | 100.000000 | 1041.00000 | 1039.600000 | 9.000000 | 9.000000 | 40.200000 | 46.70000 |
# Drop the unwanted columns; the result lives in its own frame and
# `dataset` itself is left untouched
drop = dataset.drop(columns=['Evaporation', 'Sunshine'])
drop.head(4)
| Date | Location | MinTemp | MaxTemp | Rainfall | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | WindSpeed9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | W | 44.0 | W | WNW | 20.0 | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | WNW | 44.0 | NNW | WSW | 4.0 | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | WSW | 46.0 | W | WSW | 19.0 | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NE | 24.0 | SE | E | 11.0 | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
4 rows × 21 columns
# Rename the wind gust / wind direction columns to underscore-separated names
wind_column_names = {
    'WindGustDir': 'Wind_Gust_Dir',
    'WindGustSpeed': 'Wind_Gust_Speed',
    'WindDir9am': "Wind_Dir_9am",
    'WindDir3pm': 'Wind_Dir_3pm',
}
renameData = dataset.rename(columns=wind_column_names)
renameData.head()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | Wind_Gust_Dir | Wind_Gust_Speed | Wind_Dir_9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
5 rows × 23 columns
# Confirm the four wind columns now carry their new names.
renameData.columns
Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
'Sunshine', 'Wind_Gust_Dir', 'Wind_Gust_Speed', 'Wind_Dir_9am',
'Wind_Dir_3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow'],
dtype='object')
# Boolean frame: True wherever a cell is missing.
dataset.isna()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | False | True | False | False | False | False |
| 1 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | True | True | False | False | False | False |
| 2 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | True | False | False | False | False | False |
| 3 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | True | True | False | False | False | False |
| 4 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 145455 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | True | True | False | False | False | False |
| 145456 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | True | True | False | False | False | False |
| 145457 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | True | True | False | False | False | False |
| 145458 | False | False | False | False | False | True | True | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 145459 | False | False | False | True | False | True | True | True | True | False | ... | False | False | False | False | False | False | False | False | False | True |
145460 rows × 23 columns
# Number of missing values in each column.
dataset.isna().sum()
Date 0 Location 0 MinTemp 1485 MaxTemp 1261 Rainfall 3261 Evaporation 62790 Sunshine 69835 WindGustDir 10326 WindGustSpeed 10263 WindDir9am 10566 WindDir3pm 4228 WindSpeed9am 1767 WindSpeed3pm 3062 Humidity9am 2654 Humidity3pm 4507 Pressure9am 15065 Pressure3pm 15028 Cloud9am 55888 Cloud3pm 59358 Temp9am 1767 Temp3pm 3609 RainToday 3261 RainTomorrow 3267 dtype: int64
# Flag rows that exactly repeat an earlier row (all columns compared).
dataset.duplicated()
0 False
1 False
2 False
3 False
4 False
...
145455 False
145456 False
145457 False
145458 False
145459 False
Length: 145460, dtype: bool
# Flag every row whose Location has already appeared; only the first row
# of each location comes back False.
dataset.duplicated(subset=['Location'])
0 False
1 True
2 True
3 True
4 True
...
145455 True
145456 True
145457 True
145458 True
145459 True
Length: 145460, dtype: bool
# Show the first few rows flagged as repeating an earlier Location.
dataset.loc[dataset.duplicated(subset=['Location'])].head()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
| 5 | 2008-12-06 | Albury | 14.6 | 29.7 | 0.2 | NaN | NaN | WNW | 56.0 | W | ... | 55.0 | 23.0 | 1009.2 | 1005.4 | NaN | NaN | 20.6 | 28.9 | No | No |
5 rows × 23 columns
# Example check: the first six rows of one location that was flagged as duplicated
dataset.query ('Location == "Albury" ').head(6)
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
| 5 | 2008-12-06 | Albury | 14.6 | 29.7 | 0.2 | NaN | NaN | WNW | 56.0 | W | ... | 55.0 | 23.0 | 1009.2 | 1005.4 | NaN | NaN | 20.6 | 28.9 | No | No |
6 rows × 23 columns
Feature Understanding
# How many rows exist for each date, most frequent first.
dataset['Date'].value_counts()
2013-11-12 49
2014-09-01 49
2014-08-23 49
2014-08-24 49
2014-08-25 49
..
2007-11-29 1
2007-11-28 1
2007-11-27 1
2007-11-26 1
2008-01-31 1
Name: Date, Length: 3436, dtype: int64
# How often each MaxTemp value occurs, most frequent first.
dataset['MaxTemp'].value_counts()
20.0 885
19.0 843
19.8 840
20.4 834
19.9 823
...
46.5 1
46.9 1
47.0 1
46.6 1
-2.4 1
Name: MaxTemp, Length: 505, dtype: int64
# Bar chart of the ten most frequently recorded MaxTemp values.
# value_counts() orders by frequency, so head(10) selects the most common
# readings (not the ten hottest); the title and axis labels now say so and
# use the correct spelling of "temperature".
top_max_temps = dataset['MaxTemp'].value_counts().head(10)
ax = top_max_temps.plot(kind='bar', color='r',
                        title='Top 10 Most Frequent Maximum Temperatures')
ax.set_xlabel('<--Temperature-->')
ax.set_ylabel('<--Count-->')
Text(0, 0.5, '<--Count-->')
# Histogram of the 9am humidity readings, split into 15 bins.
dataset['Humidity9am'].plot(kind='hist',bins=15,title= "Humidity in 9am")
<AxesSubplot:title={'center':'Humidity in 9am'}, ylabel='Frequency'>
Feature Relationships
# Scatter of 9am humidity against the daily maximum temperature
dataset.plot.scatter(x='MaxTemp', y='Humidity9am', color='c')
<AxesSubplot:xlabel='MaxTemp', ylabel='Humidity9am'>
# Scatter of the daily minimum against the daily maximum temperature
dataset.plot.scatter(x='MaxTemp', y='MinTemp', color='b')
<AxesSubplot:xlabel='MaxTemp', ylabel='MinTemp'>
# Scatter of 3pm pressure against the daily maximum temperature
dataset.plot.scatter(x='MaxTemp', y='Pressure3pm', color='y')
<AxesSubplot:xlabel='MaxTemp', ylabel='Pressure3pm'>
# Same MaxTemp vs. Pressure3pm scatter, drawn with seaborn.
sns.scatterplot(x='MaxTemp',
y='Pressure3pm',data =dataset)
<AxesSubplot:xlabel='MaxTemp', ylabel='Pressure3pm'>
# Seaborn scatter again, with the points additionally coloured by MaxTemp
# (the same variable that is on the x axis).
sns.scatterplot(x='MaxTemp',
y='Pressure3pm',
hue='MaxTemp',
data =dataset)
<AxesSubplot:xlabel='MaxTemp', ylabel='Pressure3pm'>
# Pairwise correlation between five numeric columns, computed only on
# rows where all five values are present
corr_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Humidity3pm', 'Pressure3pm']
complete_rows = dataset[corr_columns].dropna()
df_corre = complete_rows.corr()
df_corre
| MinTemp | MaxTemp | Rainfall | Humidity3pm | Pressure3pm | |
|---|---|---|---|---|---|
| MinTemp | 1.000000 | 0.727119 | 0.109141 | 0.028305 | -0.458110 |
| MaxTemp | 0.727119 | 1.000000 | -0.070787 | -0.495877 | -0.422165 |
| Rainfall | 0.109141 | -0.070787 | 1.000000 | 0.253789 | -0.125554 |
| Humidity3pm | 0.028305 | -0.495877 | 0.253789 | 1.000000 | 0.051285 |
| Pressure3pm | -0.458110 | -0.422165 | -0.125554 | 0.051285 | 1.000000 |
# Heatmap of the correlation matrix with each coefficient printed in its cell.
sns.heatmap(df_corre,annot=True)
<AxesSubplot:>
# Split the data into independent values (x) and the dependent value (y).
# x is a 2-D array built from column positions 1-4 and 7-21, which skips
# Date (0), Evaporation (5), Sunshine (6) and the target RainTomorrow (22).
feature_positions = [1, 2, 3, 4, *range(7, 22)]
x = dataset.iloc[:, feature_positions].values
x
array([['Albury', 13.4, 22.9, ..., 16.9, 21.8, 'No'],
['Albury', 7.4, 25.1, ..., 17.2, 24.3, 'No'],
['Albury', 12.9, 25.7, ..., 21.0, 23.2, 'No'],
...,
['Uluru', 5.4, 26.9, ..., 12.5, 26.1, 'No'],
['Uluru', 7.8, 27.0, ..., 15.1, 26.0, 'No'],
['Uluru', 14.9, nan, ..., 15.0, 20.9, 'No']], dtype=object)
# Target: the last column of the frame (RainTomorrow) as a 1-D array.
y = dataset.iloc[:,-1].values
y
array(['No', 'No', 'No', ..., 'No', 'No', nan], dtype=object)
# Reshape the 1-D target into a single-column 2-D array
Y = y.reshape(-1,1)
Y
array([['No'],
['No'],
['No'],
...,
['No'],
['No'],
[nan]], dtype=object)
Dealing With Missing Values
from sklearn.impute import SimpleImputer

# Fill missing entries with the most frequent value of their column.
# The features and the target each get their own imputer: refitting one
# shared instance on Y would overwrite what it learned from x, so it could
# no longer be used to transform new feature rows.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imputer.fit_transform(x)

# NOTE(review): imputing the target invents a label for every row whose
# RainTomorrow is unknown; consider dropping those rows instead.
target_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Y = target_imputer.fit_transform(Y)
Y
array([['No'],
['No'],
['No'],
...,
['No'],
['No'],
['No']], dtype=object)
x
array([['Albury', 13.4, 22.9, ..., 16.9, 21.8, 'No'],
['Albury', 7.4, 25.1, ..., 17.2, 24.3, 'No'],
['Albury', 12.9, 25.7, ..., 21.0, 23.2, 'No'],
...,
['Uluru', 5.4, 26.9, ..., 12.5, 26.1, 'No'],
['Uluru', 7.8, 27.0, ..., 15.1, 26.0, 'No'],
['Uluru', 14.9, 20.0, ..., 15.0, 20.9, 'No']], dtype=object)
Encoding the dataset from character (str) to numeric (num) values
from sklearn.preprocessing import LabelEncoder

# Replace every text column of x with integer codes, one encoder per column
# so each mapping can be inverted later. Positions refer to the sliced
# feature matrix x, not to the original frame:
#   0 -> Location, 4 -> WindGustDir, 6 -> WindDir9am,
#   7 -> WindDir3pm, -1 -> RainToday
lel = LabelEncoder()
x[:, 0] = lel.fit_transform(x[:, 0])
lel2 = LabelEncoder()
x[:, 4] = lel2.fit_transform(x[:, 4])
lel3 = LabelEncoder()
x[:, 6] = lel3.fit_transform(x[:, 6])
lel4 = LabelEncoder()
# Encode column 7 from its own values. The earlier code read column 4 here,
# which overwrote WindDir3pm with a copy of the WindGustDir codes.
x[:, 7] = lel4.fit_transform(x[:, 7])
lel5 = LabelEncoder()
x[:, -1] = lel5.fit_transform(x[:, -1])
lel6 = LabelEncoder()
# LabelEncoder expects a 1-D target; flatten the (n, 1) column vector to
# avoid the DataConversionWarning.
Y = lel6.fit_transform(Y.ravel())
C:\Users\Hrushikesh Patel\anaconda3\lib\site-packages\sklearn\preprocessing\_label.py:115: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
x
array([[2, 13.4, 22.9, ..., 16.9, 21.8, 0],
[2, 7.4, 25.1, ..., 17.2, 24.3, 0],
[2, 12.9, 25.7, ..., 21.0, 23.2, 0],
...,
[41, 5.4, 26.9, ..., 12.5, 26.1, 0],
[41, 7.8, 27.0, ..., 15.1, 26.0, 0],
[41, 14.9, 20.0, ..., 15.0, 20.9, 0]], dtype=object)
Y
array([0, 0, 0, ..., 0, 0, 0])
Feature Scaling
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test-set statistics leak into training -- consider fitting on
# the training rows only.
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)
x
array([[-1.53166617, 0.19132753, -0.04135977, ..., -0.01407077,
0.02310362, -0.52979545],
[-1.53166617, -0.75105231, 0.26874452, ..., 0.03244663,
0.387799 , -0.52979545],
[-1.53166617, 0.11279588, 0.35331842, ..., 0.62166712,
0.22733303, -0.52979545],
...,
[ 1.20928479, -1.06517892, 0.52246622, ..., -0.69632607,
0.65037966, -0.52979545],
[ 1.20928479, -0.68822699, 0.53656187, ..., -0.29317521,
0.63579185, -0.52979545],
[ 1.20928479, 0.42692249, -0.45013361, ..., -0.30868102,
-0.10818671, -0.52979545]])
Splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(x, Y, test_size=0.20, random_state=0)
x_train, x_test, Y_train, Y_test = split_parts
x_train
array([[ 0.22535368, 1.03946939, 0.07140543, ..., 0.68369032,
0.08145488, -0.52979545],
[ 1.42012717, -0.45263203, 0.11369237, ..., -0.41722163,
0.22733303, -0.52979545],
[ 0.50647685, -0.20133073, -0.14002932, ..., -0.06058818,
-0.02065982, 1.88752093],
...,
[ 1.0687232 , 0.75675544, 0.93124006, ..., 1.10234698,
1.07342629, -0.52979545],
[ 0.57675765, -0.04426743, -0.16822062, ..., 0.01694083,
-0.28324049, 1.88752093],
[ 1.63096955, -0.0285611 , -0.91529006, ..., -0.35519842,
-0.76463838, -0.52979545]])
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, seeded so repeated runs give the same model
forest_params = {"n_estimators": 100, "random_state": 0}
classifire = RandomForestClassifier(**forest_params)
classifire.fit(x_train, Y_train)
RandomForestClassifier(random_state=0)
# Accuracy of the forest on the rows it was trained on
classifire.score(x_train,Y_train)
0.9999226591502819
# Predicted class labels for the held-out test rows.
y_pre =classifire.predict(x_test)
y_pre
array([0, 0, 0, ..., 0, 0, 0])
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the true label
accuracy_check = accuracy_score(y_true=Y_test, y_pred=y_pre)
accuracy_check
0.8538429808882166
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of a single decision tree on the training
# rows. The tree is seeded like the other models in this notebook so the
# scores are reproducible from run to run.
scores = cross_val_score(DecisionTreeClassifier(random_state=0),
                         x_train, Y_train, cv=5)
scores
array([0.77537166, 0.78014093, 0.77489903, 0.77712371, 0.77914321])
scores.mean() # mean accuracy across the five folds
0.777335707372336
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training rows.
lr = LogisticRegression()
lr.fit(x_train,Y_train)
LogisticRegression()
# Predicted class labels for the test rows (shown only, not stored).
lr.predict(x_test)
array([0, 0, 0, ..., 1, 0, 0])
lr.score(x_test,Y_test) # logistic regression accuracy on the test set is about 84%
0.8426027773958477
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the features with all degree-2 polynomial terms; the expansion is
# fitted on the training rows and then applied unchanged to the test rows
poly_reg = PolynomialFeatures(degree=2)
X_train_poly = poly_reg.fit_transform(x_train)
X_test_poly = poly_reg.transform(x_test)
X_train_poly.shape, X_test_poly.shape
((116368, 210), (29092, 210))
# Ordinary least squares on the polynomial features.
# NOTE(review): Y_train holds encoded class labels, so this fits a
# regression to a classification target -- confirm that is intended.
lr2 = LinearRegression()
lr2.fit(X_train_poly,Y_train)
LinearRegression()
# For a regressor, score() is the R^2 coefficient on the test set, not an accuracy.
lr2.score(X_test_poly,Y_test,)
0.34192818033542083
# Continuous predictions for the test rows, followed by the same R^2 score again.
lr_pre = lr2.predict(X_test_poly)
lr2.score(X_test_poly,Y_test,)
0.34192818033542083
# Recompute the test predictions and display them (real-valued, not class labels).
lr_pre = lr2.predict(X_test_poly)
lr_pre
array([0.0990449 , 0.28330241, 0.1391984 , ..., 0.4623271 , 0.23053751,
0.04138956])
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the polynomial regression's test predictions.
mse = mean_squared_error(Y_test, lr_pre)
# `rsme` is kept for any later code that uses the original (misspelled)
# name; `rmse` is the correctly spelled alias.
rmse = rsme = np.sqrt(mse)
# Echo the value so the metric is actually reported.
rmse
# Build an automated profiling report for the raw frame and display it.
# NOTE(review): the pandas_profiling package has since been renamed to
# ydata-profiling -- confirm which one the target environment provides.
from pandas_profiling import ProfileReport
rp= ProfileReport(dataset)
rp
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]